In [1]:
import pandas as pd
import numpy as np
import sys
%matplotlib inline
In [2]:
# Report the interpreter and pandas versions this notebook was run with.
# Call syntax is required on Python 3; with a single parenthesised argument
# it prints the same text under the Python 2 print statement as well.
print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
In [3]:
# Demo frame: two groups ("a" and "b"), two rows each.
df = pd.DataFrame(
    {
        "group1": ["a", "a", "b", "b"],
        "value": [10, 20, 30, 40],
    }
)
df
Out[3]:
In [4]:
# Row count and total of every non-key column, per `group1` value.
group = df.groupby('group1')
# "sum" is passed by name because recent pandas releases warn when the
# builtin `sum` is handed to agg().  `len` stays a callable so the resulting
# column is still labelled "len".
group.agg([len, 'sum'])
Out[4]:
In [5]:
# Same shape as the earlier example, with the key column called "labels".
df = pd.DataFrame(
    dict(
        labels=["a", "a", "b", "b"],
        value=[10, 20, 30, 40],
    )
)
df
Out[5]:
In [6]:
# Compute each label's total of `value` and broadcast it back onto the
# original rows as a new column.
group = df.groupby("labels")["value"]
df["value.sum"] = group.transform("sum")
df
Out[6]:
In [7]:
# One timestamp column built from three differently formatted strings.
df = pd.DataFrame(
    {
        "col1": [
            pd.Timestamp(text)
            for text in (
                "20130102000030",
                "2013-02-03 00:00:30",
                "3/4/2013 000030",
            )
        ],
    }
)
df
Out[7]:
In [8]:
# Split the timestamp column into calendar fields.  The vectorised `.dt`
# accessor replaces a Python-level lambda call per row.
df['MonthNumber'] = df['col1'].dt.month
df['Day'] = df['col1'].dt.day
df['Year'] = df['col1'].dt.year
# %B / %A give the full month and weekday names.
df['MonthName'] = df['col1'].dt.strftime('%B')
df['WeekDay'] = df['col1'].dt.strftime('%A')
df
Out[8]:
In [9]:
# `col1` holds a textual sign for the magnitude in `col2`; the last row
# carries the literal string 'nan', not a missing value.
df = pd.DataFrame(
    dict(
        col1=["minus", "minus", "positive", "nan"],
        col2=[10, 20, 30, 40],
    )
)
df
Out[9]:
In [10]:
# Turn the textual sign into a numeric multiplier and apply it to col2.
# Labels absent from the mapping (e.g. the string 'nan') map to NaN, so
# col3 is NaN on those rows -- the same result as the nested conditional
# this replaces.
df['col3'] = df['col2'] * df['col1'].map({'minus': -1, 'positive': 1})
df
Out[10]:
In [11]:
# Two grouping keys and two numeric columns for the multi-key examples.
df = pd.DataFrame(
    dict(
        group1=["a", "a", "a", "b", "b", "b"],
        group2=["c", "c", "d", "d", "d", "e"],
        value1=[1.1, 2, 3, 4, 5, 6],
        value2=[7.1, 8, 9, 10, 11, 12],
    )
)
df
Out[11]:
In [12]:
# Bucket rows by the (group1, group2) key pair.
group = df.groupby(by=["group1", "group2"])
def Half(x):
    """Return the sum of ``x``.

    Intended as a custom reducer for groupby ``transform`` / ``agg``.

    NOTE(review): despite the name, nothing is halved -- the result is the
    plain total.  The name is kept because it becomes a column label when
    passed to ``agg``; confirm whether ``x.sum() / 2`` was intended.
    """
    return x.sum()
# Apply the custom reducer per bucket and write its result onto every
# member row of that bucket.
df["new"] = group["value1"].transform(Half)
df
Out[12]:
In [13]:
# For multiple functions
def HalfPlus(x):
    """Return the sum of ``x`` plus one.

    Second custom reducer, used to show several functions passed to ``agg``
    at once.
    """
    return x.sum() + 1
# Run both reducers over value1 in one call; the result has one column per
# function, indexed by the grouping keys.
newcol = group["value1"].agg([Half, HalfPlus])
newcol
Out[13]:
In [14]:
# Attach the per-bucket aggregates to the row-level frame: match df's key
# columns against newcol's index.
pd.merge(
    df,
    newcol,
    left_on=["group1", "group2"],
    right_index=True,
)
Out[14]:
In [15]:
# First operand for the index-alignment example.
df1 = pd.DataFrame(
    {"value": [26371, 1755, 2]},
    index=[-9999, 240, 138.99],
)
df1
Out[15]:
In [16]:
# Second operand: shares the -9999 and 240 labels with df1, the rest differ.
df2 = pd.DataFrame(
    {"value": [26371, 1755, 6, 4]},
    index=[-9999, 240, 113.03, 110],
)
df2
Out[16]:
In [17]:
# Plain addition aligns on the index, so any label that exists in only one
# of the two frames comes out as a null value.
df1 + df2
Out[17]:
In [18]:
# Avoid the nulls: with fill_value=0 a label missing from one frame is
# treated as 0 there before the addition.
df1.add(other=df2, fill_value=0)
Out[18]:
Author: David Rojas